Our data is derived from Instagram accounts and comes from Kaggle.com. The data contains usernames, followers, following, likes, comments, and locations for different accounts. This data is interesting because it has a large sample of different accounts from which we can draw conclusions about patterns in engagement scores. From the data, we derived some categories, such as follower buckets, caption lengths, and keyword buckets.
# Hide warnings and messages in the output of every chunk.
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE
)
library(tidyverse)
library(lubridate)
library(stringr)
library(dplyr)
library(plotly)
# Read the Instagram CSV and print a column-by-column overview of it.
insta_data <- read_csv(file = "instagram_data.csv")
insta_data %>%
  glimpse()
## Rows: 11,692
## Columns: 14
## $ owner_id <chr> "36063641", "36063641", "36063641", "36063641", "36063…
## $ owner_username <chr> "christendominique", "christendominique", "christendom…
## $ shortcode <chr> "C3_GS1ASeWI", "C38ivgNS3IX", "C35-Dd9SO1b", "C33TadDM…
## $ is_video <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,…
## $ caption <chr> "I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @ta3 X …
## $ comments <dbl> 268, 138, 1089, 271, 145, 143, 356, 132, 128, 884, 211…
## $ likes <dbl> 16382, 9267, 10100, 6943, 17158, 9683, 42906, 4287, 74…
## $ created_at <dbl> 1709326758, 1709241048, 1709154707, 1709065322, 170871…
## $ location <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ imageUrl <chr> "https://instagram.flba2-1.fna.fbcdn.net/v/t39.30808-6…
## $ multiple_images <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
## $ username <chr> "christendominique", "christendominique", "christendom…
## $ followers <dbl> 2144626, 2144626, 2144626, 2144626, 2144626, 2144626, …
## $ following <dbl> 1021, 1021, 1021, 1021, 1021, 1021, 1021, 1021, 1021, …
We added new columns, including quartile columns in which 1 represents the lowest follower counts, 2 and 3 the mid-range follower counts, and 4 the highest follower counts.
# Derive the per-post analysis columns:
#   engagement          - (likes + comments) as a percentage of followers
#   follower_quantile   - quartile of the follower count (1 = lowest)
#   engagement_quantile - quartile of the engagement score (1 = lowest)
#   post_timestamp      - created_at (epoch seconds) as a date-time;
#                         as_datetime() defaults to UTC
#   post_time           - posting time rounded to the nearest hour, "HH:MM"
#   caption_length      - number of space-separated tokens in the caption
new_data <- insta_data %>%
  mutate(
    engagement = round(((likes + comments) / followers) * 100, digits = 2),
    follower_quantile = ntile(followers, 4),
    engagement_quantile = ntile(engagement, 4),
    post_timestamp = as_datetime(created_at),
    post_time = format(
      round(post_timestamp, units = "hours"),
      format = "%H:%M"
    ),
    # strsplit() turns a missing caption into a single NA token, which
    # lengths() would count as one word; report 0 for such posts instead.
    caption_length = if_else(
      is.na(caption),
      0L,
      lengths(strsplit(caption, " "))
    )
  )
New columns with new calculated values.
# Print a column-by-column overview of the enriched data.
new_data %>%
  glimpse()
## Rows: 11,692
## Columns: 20
## $ owner_id <chr> "36063641", "36063641", "36063641", "36063641", "3…
## $ owner_username <chr> "christendominique", "christendominique", "christe…
## $ shortcode <chr> "C3_GS1ASeWI", "C38ivgNS3IX", "C35-Dd9SO1b", "C33T…
## $ is_video <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
## $ caption <chr> "I’m a brunch & Iced Coffee girlie☕️🍳 \n\nTop @ta…
## $ comments <dbl> 268, 138, 1089, 271, 145, 143, 356, 132, 128, 884,…
## $ likes <dbl> 16382, 9267, 10100, 6943, 17158, 9683, 42906, 4287…
## $ created_at <dbl> 1709326758, 1709241048, 1709154707, 1709065322, 17…
## $ location <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ imageUrl <chr> "https://instagram.flba2-1.fna.fbcdn.net/v/t39.308…
## $ multiple_images <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ username <chr> "christendominique", "christendominique", "christe…
## $ followers <dbl> 2144626, 2144626, 2144626, 2144626, 2144626, 21446…
## $ following <dbl> 1021, 1021, 1021, 1021, 1021, 1021, 1021, 1021, 10…
## $ engagement <dbl> 0.78, 0.44, 0.52, 0.34, 0.81, 0.46, 2.02, 0.21, 0.…
## $ follower_quantile <int> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1,…
## $ engagement_quantile <int> 3, 2, 3, 2, 3, 2, 4, 2, 2, 4, 2, 2, 1, 3, 4, 2, 3,…
## $ post_timestamp <dttm> 2024-03-01 20:59:18, 2024-02-29 21:10:48, 2024-02…
## $ post_time <chr> "21:00", "21:00", "21:00", "20:00", "20:00", "20:0…
## $ caption_length <int> 12, 34, 81, 57, 17, 66, 50, 17, 8, 53, 17, 20, 90,…
Insights on the account follower distribution: quartile 1 has the lowest follower counts and quartile 4 has the highest.
# Mean follower count within each follower quartile, rounded to a whole
# number and formatted with thousands separators for display.
new_data %>%
  group_by(follower_quantile) %>%
  summarise(
    follower_mean = mean(followers) %>%
      round(0) %>%
      format(big.mark = ",")
  )
## # A tibble: 5 × 2
## follower_quantile follower_mean
## <int> <chr>
## 1 1 108,262
## 2 2 342,149
## 3 3 834,535
## 4 4 8,559,178
## 5 NA NA
Accounts with the fewest followers have the highest engagement, whereas accounts with the most followers have the lowest engagement.
# Line plot of average engagement for each follower-count quartile.
# Rows are kept with `!is.na(engagement)`: comparing engagement against
# `is.na(engagement)` (i.e. against FALSE / 0) would also drop posts whose
# engagement is exactly 0 and bias the group means upward.
new_data %>%
  filter(!is.na(engagement)) %>%
  group_by(follower_quantile) %>%
  summarise(avg_eng = mean(engagement)) %>%
  ggplot(aes(x = follower_quantile, y = avg_eng)) +
  geom_line() +
  labs(x = "Follower Count Quartile", y = "Average Engagement")
# Mean engagement and number of posts for each (rounded) posting hour.
# Rows are kept with `!is.na(engagement)`: comparing engagement against
# `is.na(engagement)` (i.e. against FALSE / 0) would also drop posts whose
# engagement is exactly 0, skewing both the means and the counts.
new_data %>%
  filter(!is.na(engagement)) %>%
  group_by(post_time) %>%
  summarise(mean(engagement), n())
## # A tibble: 24 × 3
## post_time `mean(engagement)` `n()`
## <chr> <dbl> <int>
## 1 00:00 2.08 261
## 2 01:00 1.88 267
## 3 02:00 1.71 238
## 4 03:00 2.99 178
## 5 04:00 2.37 138
## 6 05:00 3.34 125
## 7 06:00 2.38 145
## 8 07:00 1.59 185
## 9 08:00 3.81 223
## 10 09:00 1.93 293
## # ℹ 14 more rows
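This shows the average engagement percentage by posting hour. Note that the timestamps are converted with `as_datetime()`, which defaults to UTC, so the hours are in UTC rather than each account's local time.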
We see the most engagement for posts made at 5 AM, 8 AM, 12 PM, 1 PM, 4 PM, and 5 PM, which are peak times of the day.
# Horizontal bar chart of mean engagement (rounded to 1 decimal) by posting
# hour, rendered as an interactive plotly widget.
#
# - Rows are kept with `!is.na(engagement)`: comparing engagement against
#   `is.na(engagement)` (i.e. against FALSE / 0) would also drop posts whose
#   engagement is exactly 0.
# - geom_col() always uses the identity stat and has no `stat` parameter, so
#   none is passed.
# - The legend is hidden with guide = "none"; `guide = FALSE` is deprecated.
time_eng <- new_data %>%
  filter(!is.na(engagement)) %>%
  group_by(post_time) %>%
  summarise(eng_mean = round(mean(engagement), 1)) %>%
  ggplot(aes(x = eng_mean, y = post_time, fill = as.factor(post_time))) +
  geom_col() +
  labs(y = "Posting Time", x = "Avg Engagement %") +
  # Highlight the selected hours; hours without a manual value fall back
  # to the scale's `na.value` colour.
  scale_fill_manual(
    values = c(
      "05:00" = "tomato",
      "08:00" = "tomato",
      "12:00" = "tomato",
      "13:00" = "tomato",
      "16:00" = "tomato",
      "17:00" = "tomato"
    ),
    guide = "none"
  )
ggplotly(time_eng)
Highest engagement posts include captions with lengths x & y.
# Dot plot of average engagement per caption-length bucket.
#
# - Rows are kept with `!is.na(engagement)`: comparing engagement against
#   `is.na(engagement)` (i.e. against FALSE / 0) would also drop posts whose
#   engagement is exactly 0.
# - case_when() returns the first matching branch, so each branch only needs
#   its upper bound. The last bucket uses `>= 350` so that a caption of
#   exactly 350 tokens lands in "350+" instead of being left unbucketed.
new_data %>%
  filter(!is.na(engagement)) %>%
  mutate(
    caption_bucket = case_when(
      caption_length < 50 ~ "<50",
      caption_length < 100 ~ "50-100",
      caption_length < 150 ~ "100-150",
      caption_length < 200 ~ "150-200",
      caption_length < 250 ~ "200-250",
      caption_length < 300 ~ "250-300",
      caption_length < 350 ~ "300-350",
      caption_length >= 350 ~ "350+"
    )
  ) %>%
  group_by(caption_bucket) %>%
  summarise(avg_eng = mean(engagement)) %>%
  ggplot(aes(x = caption_bucket, y = avg_eng)) +
  geom_point(size = 5) +
  labs(x = "Caption Length", y = "Average Engagement") +
  # Fix the axis order; the bucket labels would otherwise sort alphabetically.
  scale_x_discrete(
    limits = c(
      "<50", "50-100", "100-150", "150-200",
      "200-250", "250-300", "300-350", "350+"
    )
  )
Clearly, single images get more engagement, whereas carousels get less.
# Bar chart of average engagement by content type, where the type is derived
# from the is_video and multiple_images flags:
#   Video    - a video that is not part of a multi-image post
#   Picture  - a non-video post with a single image
#   Carousel - any post with multiple images
# Rows are kept with `!is.na(engagement)`: comparing engagement against
# `is.na(engagement)` (i.e. against FALSE / 0) would also drop posts whose
# engagement is exactly 0 and bias the group means upward.
new_data %>%
  filter(!is.na(engagement)) %>%
  mutate(
    type = case_when(
      is_video & !multiple_images ~ "Video",
      !is_video & !multiple_images ~ "Picture",
      multiple_images ~ "Carousel"
    )
  ) %>%
  group_by(type) %>%
  summarise(avg_eng = mean(engagement)) %>%
  ggplot(aes(x = type, y = avg_eng)) +
  geom_col() +
  labs(x = "Content Type", y = "Average Engagement")
We see pictures get more comments and likes than videos.
# Scatter plot of comments against likes on log10 axes, with points
# coloured by whether the post is a video.
ggplot(
  new_data,
  aes(x = likes, y = comments, group = is_video, color = is_video)
) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  scale_color_manual(
    name = "Type",
    labels = c("Picture", "Video"),
    values = c("blue", "red")
  )